Wine Quality

Team 6

10/17/2022

1 Importing libraries and dataset

Importing the libraries

library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded

Importing the dataset

library(ggplot2)
# Read the combined red/white wine data; the path is relative to this document.
wine <- read.csv("../dataset/wine-quality-white-and-red.csv")
# Store the wine type as a factor so it is treated as a categorical variable.
wine$type <- as.factor(wine$type)
str(wine)
## 'data.frame':    6497 obs. of  13 variables:
##  $ type                : Factor w/ 2 levels "red","white": 2 2 2 2 2 2 2 2 2 2 ...
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
nrow(wine)
## [1] 6497
ncol(wine)
## [1] 13

Getting the summary of the dataset

summary(wine)
##     type      fixed.acidity    volatile.acidity  citric.acid    
##  red  :1599   Min.   : 3.800   Min.   :0.0800   Min.   :0.0000  
##  white:4898   1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500  
##               Median : 7.000   Median :0.2900   Median :0.3100  
##               Mean   : 7.215   Mean   :0.3397   Mean   :0.3186  
##               3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900  
##               Max.   :15.900   Max.   :1.5800   Max.   :1.6600  
##  residual.sugar     chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   : 0.600   Min.   :0.00900   Min.   :  1.00      Min.   :  6.0       
##  1st Qu.: 1.800   1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0       
##  Median : 3.000   Median :0.04700   Median : 29.00      Median :118.0       
##  Mean   : 5.443   Mean   :0.05603   Mean   : 30.53      Mean   :115.7       
##  3rd Qu.: 8.100   3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0       
##  Max.   :65.800   Max.   :0.61100   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9923   1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50  
##  Median :0.9949   Median :3.210   Median :0.5100   Median :10.30  
##  Mean   :0.9947   Mean   :3.219   Mean   :0.5313   Mean   :10.49  
##  3rd Qu.:0.9970   3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30  
##  Max.   :1.0390   Max.   :4.010   Max.   :2.0000   Max.   :14.90  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.818  
##  3rd Qu.:6.000  
##  Max.   :9.000
# Keep only the first occurrence of each fully identical row.
wine <- unique(wine)
nrow(wine)
## [1] 5320
# Shuffle the rows reproducibly: draw a random permutation of the row
# indices under a fixed seed and reorder the data frame with it.
set.seed(42)
rows <- sample.int(nrow(wine))
wine <- wine[rows, , drop = FALSE]
nrow(wine)
## [1] 5320

Checking for null values

colSums(is.na(wine))
##                 type        fixed.acidity     volatile.acidity 
##                    0                    0                    0 
##          citric.acid       residual.sugar            chlorides 
##                    0                    0                    0 
##  free.sulfur.dioxide total.sulfur.dioxide              density 
##                    0                    0                    0 
##                   pH            sulphates              alcohol 
##                    0                    0                    0 
##              quality 
##                    0

Previewing the combined red and white wine dataset after removing the duplicates.

head(wine)
##       type fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 white           6.5            0.240        0.38            1.0     0.027
## 5017   red           8.8            0.550        0.04            2.2     0.119
## 2875 white           5.4            0.230        0.36            1.5     0.030
## 6442   red          11.1            0.440        0.42            2.2     0.064
## 1301 white           9.0            0.245        0.38            5.9     0.045
## 1486 white           7.4            0.280        0.49            1.5     0.034
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 5017                  14                   56 0.99620 3.21      0.60    10.9
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 6442                  14                   19 0.99758 3.25      0.57    10.4
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
##      quality
## 3181       6
## 5017       6
## 2875       7
## 6442       6
## 1301       6
## 1486       6

2 EDA

Showing the distribution of quality scores for white and red wines.

ggplot(wine, aes(x=quality, color=type)) +
  geom_histogram(fill="white", position="dodge")+
  scale_x_continuous(limits = c(3, 9), breaks = seq(3, 9, 1)) +
  theme(legend.position="top")+
  xlab('Quality of Wine') +
  ylab('Number of Wines')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Secondly, we would like to see how each independent variable differs between white and red wine.

  1. fixed.acidity
ggplot(wine, aes(x=fixed.acidity, color=type)) +
  geom_boxplot()

  2. volatile.acidity
ggplot(wine, aes(x=volatile.acidity, color=type)) +
  geom_boxplot()

3.citric.acid

ggplot(wine, aes(x=citric.acid, color=type)) +
  geom_boxplot()

4.residual.sugar

ggplot(wine, aes(x=residual.sugar, color=type)) +
  geom_boxplot()

5.chlorides

ggplot(wine, aes(x=chlorides, color=type)) +
  geom_boxplot()

6.free.sulfur.dioxide

ggplot(wine, aes(x=free.sulfur.dioxide, color=type)) +
  geom_boxplot()

7.total.sulfur.dioxide

ggplot(wine, aes(x=total.sulfur.dioxide, color=type)) +
  geom_boxplot()

8.density

ggplot(wine, aes(x=density, color=type)) +
  geom_boxplot()

9.pH

ggplot(wine, aes(x=pH, color=type)) +
  geom_boxplot()

10.sulphates

ggplot(wine, aes(x=sulphates, color=type)) +
  geom_boxplot()

11.alcohol

ggplot(wine, aes(x=alcohol, color=type)) +
  geom_boxplot()

From the boxplots, we can see that the distributions differ noticeably between red and white wine for the following variables: fixed.acidity, residual.sugar, total.sulfur.dioxide, free.sulfur.dioxide, chlorides, volatile.acidity.

# Correlation matrix of every column except the categorical `type`,
# drawn as an upper-triangle correlation plot.
wine_noca <- wine[, names(wine) != "type"]
wine_nocacor <- cor(wine_noca)
corrplot(wine_nocacor, type = "upper")

As we can see from the correlation diagram, the variables below are the most strongly correlated with quality, and we would like to dig further into their effect: quality vs alcohol, quality vs density, quality vs volatile.acidity, quality vs chlorides.

quality vs alcohol quality vs sulphates quality vs citric.acid quality vs volatile.acidity

We will analyse each variable in the following steps: 1. normality check - Q-Q plot & histogram 2. correlation test 3. scatter plot & boxplot - check the relationship 4. ANOVA test - compare the mean of the attribute across the quality levels of the wines and check whether the differences are statistically significant.

1. quality vs alcohol: the distribution of the alcohol data is right-skewed.

qqnorm(wine$alcohol, pch = 1, frame = FALSE)
qqline(wine$alcohol, col = "steelblue", lwd = 2)

ggplot(wine, aes(x=alcohol)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cor.test(wine$alcohol,wine$quality)
## 
##  Pearson's product-moment correlation
## 
## data:  wine$alcohol and wine$quality
## t = 38.769, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4482031 0.4901119
## sample estimates:
##       cor 
## 0.4694218
ggplot(wine, aes(x=alcohol, y=quality, color=type, shape = type)) +
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  ggtitle("alcohol vs Quality")
## `geom_smooth()` using formula = 'y ~ x'

wine_cat <- wine
wine_cat$quality <- factor(wine_cat$quality)
ggplot(wine_cat, aes(x=quality, y=alcohol, fill=type)) + 
    geom_boxplot() +
    facet_wrap(~type)+
    ggtitle("alcohol vs Quality")

anova_alcohol = aov(alcohol ~ quality, data=wine_cat)
summary(anova_alcohol)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## quality        6   1933   322.1   308.4 <2e-16 ***
## Residuals   5313   5548     1.0                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2. quality vs density: the density data looks approximately normally distributed.

qqnorm(wine$density, pch = 1, frame = FALSE)
qqline(wine$density, col = "steelblue", lwd = 2)

ggplot(wine, aes(x=density)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cor.test(wine$density,wine$quality)
## 
##  Pearson's product-moment correlation
## 
## data:  wine$density and wine$quality
## t = -25.185, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3502348 -0.3022129
## sample estimates:
##        cor 
## -0.3264345
ggplot(wine, aes(x=density, y=quality, color=type, shape = type)) +
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  ggtitle("density vs Quality")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(wine_cat, aes(x=quality, y=density, fill=type)) + 
    geom_boxplot() +
    facet_wrap(~type)+
    ggtitle("density vs Quality")

anova_density = aov(density ~ quality, data=wine_cat)
summary(anova_density)
##               Df  Sum Sq   Mean Sq F value Pr(>F)    
## quality        6 0.00600 0.0010007   130.4 <2e-16 ***
## Residuals   5313 0.04077 0.0000077                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3. quality vs volatile.acidity: the distribution of the volatile.acidity data is almost normal; however, there is a small tail on the right side of the plot.

qqnorm(wine$volatile.acidity, pch = 1, frame = FALSE)
qqline(wine$volatile.acidity, col = "steelblue", lwd = 2)

ggplot(wine, aes(x=volatile.acidity)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cor.test(wine$volatile.acidity,wine$quality)
## 
##  Pearson's product-moment correlation
## 
## data:  wine$volatile.acidity and wine$quality
## t = -20.058, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2900109 -0.2400432
## sample estimates:
##        cor 
## -0.2652051
ggplot(wine, aes(x=volatile.acidity, y=quality, color=type, shape = type)) +
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  ggtitle("volatile.acidity vs Quality")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(wine_cat, aes(x=quality, y=volatile.acidity, fill=type)) + 
    geom_boxplot() +
    facet_wrap(~type)+
    ggtitle("volatile.acidity vs Quality")

anova_volatile.acidity = aov(volatile.acidity ~ quality, data=wine_cat)
summary(anova_volatile.acidity)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## quality        6  12.57   2.096   80.68 <2e-16 ***
## Residuals   5313 137.99   0.026                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4. quality vs chlorides: the distribution looks roughly normal but is right-skewed.

# Normality check for chlorides. This section analyses chlorides, so the
# Q-Q plot, histogram and correlation test must all use wine$chlorides.
# NOTE: re-knit the document to refresh the printed test output that follows.
qqnorm(wine$chlorides, pch = 1, frame = FALSE)
qqline(wine$chlorides, col = "steelblue", lwd = 2)

ggplot(wine, aes(x = chlorides)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Pearson correlation between chlorides and the quality score.
cor.test(wine$chlorides, wine$quality)
## 
##  Pearson's product-moment correlation
## 
## data:  wine$density and wine$quality
## t = -25.185, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3502348 -0.3022129
## sample estimates:
##        cor 
## -0.3264345
ggplot(wine, aes(x=chlorides,y=quality, color=type, shape = type)) +
  geom_point() + 
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  ggtitle("chlorides vs Quality")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(wine_cat, aes(x=quality, y=chlorides, fill=type)) + 
    geom_boxplot() +
    facet_wrap(~type)+
    ggtitle("chlorides vs Quality")

anova_chlorides = aov(chlorides ~ quality, data=wine_cat)
summary(anova_chlorides)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## quality        6  0.337 0.05623   43.35 <2e-16 ***
## Residuals   5313  6.891 0.00130                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3 Modeling

# Draw one scatter plot for every pair of columns other than the last one
# (`quality`), colouring the points by quality score. The two column
# indices are printed before each plot.
n_plot_cols <- length(wine) - 1
col_names <- colnames(wine)
# seq_len() yields an empty sequence (instead of counting down) when there
# are fewer than two columns to pair up.
for (xx in seq_len(n_plot_cols - 1)) {
  for (yy in seq(xx + 1, n_plot_cols)) {
    print(xx)
    print(yy)
    # Map columns by name through the `.data` pronoun rather than indexing
    # `wine` inside aes().
    p <- ggplot(wine, aes(x = .data[[col_names[xx]]],
                          y = .data[[col_names[yy]]],
                          color = quality)) +
      geom_point() +
      labs(x = col_names[xx],
           y = col_names[yy],
           title = paste(col_names[yy], "vs", col_names[xx]))
    print(p)
  }
}
## [1] 1
## [1] 2

## [1] 1
## [1] 3

## [1] 1
## [1] 4

## [1] 1
## [1] 5

## [1] 1
## [1] 6

## [1] 1
## [1] 7

## [1] 1
## [1] 8

## [1] 1
## [1] 9

## [1] 1
## [1] 10

## [1] 1
## [1] 11

## [1] 1
## [1] 12

## [1] 2
## [1] 3

## [1] 2
## [1] 4

## [1] 2
## [1] 5

## [1] 2
## [1] 6

## [1] 2
## [1] 7

## [1] 2
## [1] 8

## [1] 2
## [1] 9

## [1] 2
## [1] 10

## [1] 2
## [1] 11

## [1] 2
## [1] 12

## [1] 3
## [1] 4

## [1] 3
## [1] 5

## [1] 3
## [1] 6

## [1] 3
## [1] 7

## [1] 3
## [1] 8

## [1] 3
## [1] 9

## [1] 3
## [1] 10

## [1] 3
## [1] 11

## [1] 3
## [1] 12

## [1] 4
## [1] 5

## [1] 4
## [1] 6

## [1] 4
## [1] 7

## [1] 4
## [1] 8

## [1] 4
## [1] 9

## [1] 4
## [1] 10

## [1] 4
## [1] 11

## [1] 4
## [1] 12

## [1] 5
## [1] 6

## [1] 5
## [1] 7

## [1] 5
## [1] 8

## [1] 5
## [1] 9

## [1] 5
## [1] 10

## [1] 5
## [1] 11

## [1] 5
## [1] 12

## [1] 6
## [1] 7

## [1] 6
## [1] 8

## [1] 6
## [1] 9

## [1] 6
## [1] 10

## [1] 6
## [1] 11

## [1] 6
## [1] 12

## [1] 7
## [1] 8

## [1] 7
## [1] 9

## [1] 7
## [1] 10

## [1] 7
## [1] 11

## [1] 7
## [1] 12

## [1] 8
## [1] 9

## [1] 8
## [1] 10

## [1] 8
## [1] 11

## [1] 8
## [1] 12

## [1] 9
## [1] 10

## [1] 9
## [1] 11

## [1] 9
## [1] 12

## [1] 10
## [1] 11

## [1] 10
## [1] 12

## [1] 11
## [1] 12

3.1 KNN Algorithm

set.seed(1000)
# Binary target: quality scores above 5 are labelled "high", the rest "low".
qualityvariable <- ifelse(wine$quality > 5, "high", "low")
wineknn <- data.frame(wine, qualityvariable)
# Drop the raw quality score and the categorical wine type by name rather
# than by column position, so the selection does not depend on column order.
wineknn <- wineknn[, !(names(wineknn) %in% c("quality", "type"))]
head(wineknn)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181           6.5            0.240        0.38            1.0     0.027
## 5017           8.8            0.550        0.04            2.2     0.119
## 2875           5.4            0.230        0.36            1.5     0.030
## 6442          11.1            0.440        0.42            2.2     0.064
## 1301           9.0            0.245        0.38            5.9     0.045
## 1486           7.4            0.280        0.49            1.5     0.034
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 5017                  14                   56 0.99620 3.21      0.60    10.9
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 6442                  14                   19 0.99758 3.25      0.57    10.4
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
##      qualityvariable
## 3181            high
## 5017            high
## 2875            high
## 6442            high
## 1301            high
## 1486            high
scaleddata <- as.data.frame(scale(wineknn[1:11], center = TRUE, scale = TRUE))
head(scaleddata)
##      fixed.acidity volatile.acidity citric.acid residual.sugar  chlorides
## 3181    -0.5419372       -0.6189051   0.4179584     -0.8996256 -0.8054037
## 5017     1.2009219        1.2236103  -1.8924939     -0.6329697  1.6903024
## 2875    -1.3754784       -0.6783410   0.2820494     -0.7885190 -0.7240220
## 6442     2.9437810        0.5698145   0.6897763     -0.6329697  0.1983042
## 1301     1.3524749       -0.5891871   0.4179584      0.1892197 -0.3171134
## 1486     0.1400512       -0.3811611   1.1654576     -0.7885190 -0.6155130
##      free.sulfur.dioxide total.sulfur.dioxide    density          pH  sulphates
## 3181          0.05410522           -0.4246473 -1.7788872  0.09562628 -1.1576984
## 5017         -0.90068036           -1.0235107  0.5613550 -0.09143040  0.4450484
## 2875          2.46915110            0.1213751 -1.6102818  0.09562628  3.0495120
## 6442         -0.90068036           -1.6752149  1.0267058  0.15797850  0.2447051
## 1301          1.23354623            0.7906929  0.1567022 -1.83729267 -1.2244795
## 1486         -0.56369721            0.2094432 -0.9223720 -1.52553155 -0.9573550
##          alcohol
## 3181  1.47627132
## 5017  0.29576611
## 2875  1.30762772
## 6442 -0.12584289
## 1301 -0.29448649
## 1486  0.04280071
wine_sample <- sample(2, nrow(scaleddata), replace=TRUE, prob=c(0.67, 0.33))
wine_training <- scaleddata[wine_sample==1, ]
wine_test <- scaleddata[wine_sample==2, ]
nrow(wine_test)
## [1] 1741
# Class labels ("high"/"low") for the rows assigned to each partition.
wine.trainLabels <- wineknn$qualityvariable[wine_sample == 1]

wine.testLabels <- wineknn$qualityvariable[wine_sample == 2]
length(wine.testLabels)
## [1] 1741
library(class)
library(caret)
## Loading required package: lattice
nrow(wine_training)
## [1] 3579
length(wine.trainLabels)
## [1] 3579
wine_model_pred <- knn(train = wine_training, test = wine_test, cl=wine.trainLabels, k=19)
library(gmodels)
crosst <- CrossTable(wine.testLabels, wine_model_pred, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1741 
## 
##  
##                 | wine_model_pred 
## wine.testLabels |      high |       low | Row Total | 
## ----------------|-----------|-----------|-----------|
##            high |       920 |       164 |      1084 | 
##                 |     0.849 |     0.151 |     0.623 | 
##                 |     0.767 |     0.303 |           | 
##                 |     0.528 |     0.094 |           | 
## ----------------|-----------|-----------|-----------|
##             low |       280 |       377 |       657 | 
##                 |     0.426 |     0.574 |     0.377 | 
##                 |     0.233 |     0.697 |           | 
##                 |     0.161 |     0.217 |           | 
## ----------------|-----------|-----------|-----------|
##    Column Total |      1200 |       541 |      1741 | 
##                 |     0.689 |     0.311 |           | 
## ----------------|-----------|-----------|-----------|
## 
## 
crosst
## $t
##       y
## x      high low
##   high  920 164
##   low   280 377
## 
## $prop.row
##       y
## x           high       low
##   high 0.8487085 0.1512915
##   low  0.4261796 0.5738204
## 
## $prop.col
##       y
## x           high       low
##   high 0.7666667 0.3031423
##   low  0.2333333 0.6968577
## 
## $prop.tbl
##       y
## x            high        low
##   high 0.52843194 0.09419874
##   low  0.16082711 0.21654222
# caret's confusionMatrix() compares the predicted classes with the true
# test labels.
cm <- confusionMatrix(data = wine_model_pred,
                      reference = as.factor(wine.testLabels))

print(paste("Total Accuracy = ", cm$overall[["Accuracy"]]))
## [1] "Total Accuracy =  0.744974152785755"
#' Fit a k-nearest-neighbour classifier and measure its accuracy.
#'
#' @param k Number of neighbours considered.
#' @param train_set Training cases (predictor columns only).
#' @param val_set Validation cases to classify.
#' @param train_class True classes of the rows of `train_set`.
#' @param val_class True classes of the rows of `val_set`.
#' @return A 1 x 2 matrix with columns `k` and `accuracy`.
chooseK <- function(k, train_set, val_set, train_class, val_class) {

  # Fix the seed so that any random tie-breaking in knn() is reproducible.
  set.seed(1)
  class_knn <- knn(train = train_set,    #<- training set cases
                   test = val_set,       #<- validation set cases
                   cl = train_class,     #<- category for classification
                   k = k)                #<- number of neighbours considered

  # Accuracy = share of validation cases whose predicted class equals the
  # true class. The comparison is done element by element: reading the
  # "diagonal" of table(predicted, truth) is only correct when both vectors
  # have exactly the same levels in the same order, which fails as soon as
  # a class is missing from the validation labels.
  accu <- mean(as.character(class_knn) == as.character(val_class),
               na.rm = TRUE)
  cbind(k = k, accuracy = accu)
}

# Evaluate chooseK() for every odd k from 1 to 29 without an explicit loop.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# numeric matrix with two rows (k, accuracy) and one column per value of k.
knn_different_k <- vapply(seq(1, 30, by = 2),  #<- odd values of k from 1 to 29
                          function(x) as.vector(chooseK(x,
                                                        train_set = wine_training,
                                                        val_set = wine_test,
                                                        train_class = wine.trainLabels,
                                                        val_class = wine.testLabels)),
                          FUN.VALUE = numeric(2))

# Reformat the results to graph the results.
str(knn_different_k)
##  num [1:2, 1:15] 1 0.674 3 0.706 5 ...
# Row 1 of the matrix holds the k values and row 2 the matching accuracies;
# turn them into the two columns of a data frame for plotting.
knn_different_k <- data.frame(
  k = knn_different_k[1L, ],
  accuracy = knn_different_k[2L, ]
)
knn_different_k
##     k  accuracy
## 1   1 0.6743251
## 2   3 0.7064905
## 3   5 0.7237220
## 4   7 0.7248708
## 5   9 0.7248708
## 6  11 0.7288914
## 7  13 0.7225732
## 8  15 0.7340609
## 9  17 0.7409535
## 10 19 0.7449742
## 11 21 0.7415279
## 12 23 0.7432510
## 13 25 0.7357840
## 14 27 0.7369328
## 15 29 0.7409535
# Plot accuracy vs. k.
# install.packages("ggplot2")

ggplot(knn_different_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", linewidth = 1.5) +
  geom_point(size = 3) + 
  labs(title = "accuracy vs k")

3.2 Decision Tree

set.seed(1000)
qualityvariable <- ifelse(wine$quality > 5, "high", "low")
winedc<- data.frame(wine, qualityvariable)
table(winedc$qualityvariable)
## 
## high  low 
## 3332 1988
# Remove the raw quality score and the wine type by name rather than by
# column position, so the selection does not depend on column order.
winedc <- winedc[, !(names(winedc) %in% c("quality", "type"))]
# Randomly assign each row to group 1 (training, probability 0.67) or
# group 2 (test, probability 0.33).
wine_sample_for_dc <- sample(2, nrow(winedc), replace = TRUE, prob = c(0.67, 0.33))
wine_training_for_dc <- winedc[wine_sample_for_dc == 1, ]
nrow(wine_training_for_dc)
## [1] 3579
wine_test_for_dc <- winedc[wine_sample_for_dc==2, ]
nrow(wine_test_for_dc)
## [1] 1741
library(rpart)
library(rpart.plot)
# Tree-growing limits passed to rpart: minimum node size to attempt a split,
# minimum leaf size, complexity threshold, maximum depth, and the number of
# surrogate splits kept per node.
control <- rpart.control(
  minsplit = 5L,
  minbucket = 5,
  cp = 0.002,
  maxdepth = 5L,
  maxsurrogate = 4
)
# Classification tree predicting the high/low label from all other columns.
modeldc <- rpart(qualityvariable ~ ., data = wine_training_for_dc,
                 method = "class", control = control)
summary(modeldc)
## Call:
## rpart(formula = qualityvariable ~ ., data = wine_training_for_dc, 
##     method = "class", control = control)
##   n= 3579 
## 
##             CP nsplit rel error    xerror       xstd
## 1  0.155522164      0 1.0000000 1.0000000 0.02172342
## 2  0.141247183      1 0.8444778 0.8572502 0.02094598
## 3  0.008640120      2 0.7032307 0.7280240 0.01997208
## 4  0.007513148      5 0.6754320 0.7017280 0.01973910
## 5  0.006260957      9 0.6416228 0.6979715 0.01970479
## 6  0.003756574     12 0.6228400 0.6919609 0.01964936
## 7  0.003005259     13 0.6190834 0.6934636 0.01966328
## 8  0.002629602     16 0.6100676 0.7009767 0.01973226
## 9  0.002253944     20 0.5995492 0.6972201 0.01969790
## 10 0.002000000     22 0.5950413 0.6972201 0.01969790
## 
## Variable importance
##              alcohol              density            chlorides 
##                   28                   15                   13 
##     volatile.acidity total.sulfur.dioxide       residual.sugar 
##                   13                   10                    9 
##  free.sulfur.dioxide            sulphates          citric.acid 
##                    6                    3                    1 
##        fixed.acidity                   pH 
##                    1                    1 
## 
## Node number 1: 3579 observations,    complexity param=0.1555222
##   predicted class=high  expected loss=0.3718916  P(node) =1
##     class counts:  2248  1331
##    probabilities: 0.628 0.372 
##   left son=2 (1832 obs) right son=3 (1747 obs)
##   Primary splits:
##       alcohol          < 10.35    to the right, improve=239.59600, (0 missing)
##       density          < 0.99285  to the left,  improve=134.90020, (0 missing)
##       volatile.acidity < 0.4575   to the left,  improve= 90.81570, (0 missing)
##       chlorides        < 0.0495   to the left,  improve= 85.99684, (0 missing)
##       citric.acid      < 0.235    to the right, improve= 70.23665, (0 missing)
##   Surrogate splits:
##       density              < 0.993395 to the left,  agree=0.760, adj=0.508, (0 split)
##       chlorides            < 0.0415   to the left,  agree=0.677, adj=0.339, (0 split)
##       total.sulfur.dioxide < 139.5    to the left,  agree=0.638, adj=0.258, (0 split)
##       residual.sugar       < 6.975    to the left,  agree=0.626, adj=0.234, (0 split)
## 
## Node number 2: 1832 observations,    complexity param=0.007513148
##   predicted class=high  expected loss=0.1932314  P(node) =0.5118748
##     class counts:  1478   354
##    probabilities: 0.807 0.193 
##   left son=4 (668 obs) right son=5 (1164 obs)
##   Primary splits:
##       alcohol             < 11.71667 to the right, improve=31.74577, (0 missing)
##       free.sulfur.dioxide < 11.5     to the right, improve=24.90616, (0 missing)
##       volatile.acidity    < 0.5475   to the left,  improve=18.42472, (0 missing)
##       density             < 0.99167  to the left,  improve=17.23243, (0 missing)
##       citric.acid         < 0.255    to the right, improve=13.60497, (0 missing)
##   Surrogate splits:
##       density       < 0.99087  to the left,  agree=0.776, adj=0.386, (0 split)
##       chlorides     < 0.0335   to the left,  agree=0.673, adj=0.103, (0 split)
##       fixed.acidity < 5.45     to the left,  agree=0.652, adj=0.046, (0 split)
##       sulphates     < 0.365    to the left,  agree=0.648, adj=0.036, (0 split)
## 
## Node number 3: 1747 observations,    complexity param=0.1412472
##   predicted class=low   expected loss=0.4407556  P(node) =0.4881252
##     class counts:   770   977
##    probabilities: 0.441 0.559 
##   left son=6 (694 obs) right son=7 (1053 obs)
##   Primary splits:
##       volatile.acidity    < 0.275    to the left,  improve=87.28633, (0 missing)
##       citric.acid         < 0.235    to the right, improve=25.28865, (0 missing)
##       chlorides           < 0.0595   to the left,  improve=23.97400, (0 missing)
##       alcohol             < 9.85     to the right, improve=22.56507, (0 missing)
##       free.sulfur.dioxide < 24.5     to the right, improve=10.83186, (0 missing)
##   Surrogate splits:
##       chlorides           < 0.0595   to the left,  agree=0.693, adj=0.226, (0 split)
##       free.sulfur.dioxide < 36.5     to the right, agree=0.657, adj=0.135, (0 split)
##       residual.sugar      < 7.85     to the right, agree=0.641, adj=0.095, (0 split)
##       sulphates           < 0.445    to the left,  agree=0.638, adj=0.089, (0 split)
## 
## Node number 4: 668 observations
##   predicted class=high  expected loss=0.07035928  P(node) =0.1866443
##     class counts:   621    47
##    probabilities: 0.930 0.070 
## 
## Node number 5: 1164 observations,    complexity param=0.007513148
##   predicted class=high  expected loss=0.2637457  P(node) =0.3252305
##     class counts:   857   307
##    probabilities: 0.736 0.264 
##   left son=10 (848 obs) right son=11 (316 obs)
##   Primary splits:
##       free.sulfur.dioxide < 16.5     to the right, improve=23.181820, (0 missing)
##       volatile.acidity    < 0.4025   to the left,  improve=16.839020, (0 missing)
##       citric.acid         < 0.255    to the right, improve= 9.986410, (0 missing)
##       residual.sugar      < 0.975    to the right, improve= 7.686083, (0 missing)
##       sulphates           < 0.635    to the right, improve= 7.114477, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 33.5     to the right, agree=0.851, adj=0.453, (0 split)
##       chlorides            < 0.0605   to the left,  agree=0.771, adj=0.158, (0 split)
##       fixed.acidity        < 8.55     to the left,  agree=0.765, adj=0.133, (0 split)
##       volatile.acidity     < 0.4425   to the left,  agree=0.754, adj=0.095, (0 split)
## 
## Node number 6: 694 observations,    complexity param=0.00864012
##   predicted class=high  expected loss=0.3645533  P(node) =0.1939089
##     class counts:   441   253
##    probabilities: 0.635 0.365 
##   left son=12 (361 obs) right son=13 (333 obs)
##   Primary splits:
##       volatile.acidity    < 0.2275   to the left,  improve=13.825610, (0 missing)
##       alcohol             < 9.55     to the right, improve= 7.818614, (0 missing)
##       residual.sugar      < 17.65    to the left,  improve= 6.397418, (0 missing)
##       sulphates           < 0.485    to the right, improve= 5.381152, (0 missing)
##       free.sulfur.dioxide < 23.5     to the right, improve= 5.369760, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 153.5    to the left,  agree=0.641, adj=0.252, (0 split)
##       density              < 0.995215 to the left,  agree=0.620, adj=0.207, (0 split)
##       residual.sugar       < 7.15     to the left,  agree=0.610, adj=0.186, (0 split)
##       free.sulfur.dioxide  < 47.5     to the left,  agree=0.591, adj=0.147, (0 split)
## 
## Node number 7: 1053 observations,    complexity param=0.006260957
##   predicted class=low   expected loss=0.3124406  P(node) =0.2942163
##     class counts:   329   724
##    probabilities: 0.312 0.688 
##   left son=14 (261 obs) right son=15 (792 obs)
##   Primary splits:
##       alcohol              < 9.85     to the right, improve=8.837949, (0 missing)
##       sulphates            < 0.545    to the right, improve=7.746356, (0 missing)
##       volatile.acidity     < 0.555    to the left,  improve=7.021819, (0 missing)
##       fixed.acidity        < 10.05    to the right, improve=6.366498, (0 missing)
##       total.sulfur.dioxide < 51.5     to the left,  improve=6.361199, (0 missing)
##   Surrogate splits:
##       density          < 0.99273  to the left,  agree=0.762, adj=0.038, (0 split)
##       fixed.acidity    < 5.1      to the left,  agree=0.756, adj=0.015, (0 split)
##       pH               < 3.565    to the right, agree=0.756, adj=0.015, (0 split)
##       volatile.acidity < 0.99     to the right, agree=0.754, adj=0.008, (0 split)
## 
## Node number 10: 848 observations,    complexity param=0.003005259
##   predicted class=high  expected loss=0.2028302  P(node) =0.2369377
##     class counts:   676   172
##    probabilities: 0.797 0.203 
##   left son=20 (709 obs) right son=21 (139 obs)
##   Primary splits:
##       volatile.acidity < 0.375    to the left,  improve=8.183534, (0 missing)
##       residual.sugar   < 14.55    to the left,  improve=5.237579, (0 missing)
##       chlorides        < 0.0395   to the left,  improve=5.143133, (0 missing)
##       citric.acid      < 0.255    to the right, improve=4.576281, (0 missing)
##       alcohol          < 10.99    to the right, improve=4.322759, (0 missing)
##   Surrogate splits:
##       chlorides            < 0.0635   to the left,  agree=0.889, adj=0.324, (0 split)
##       citric.acid          < 0.155    to the right, agree=0.888, adj=0.317, (0 split)
##       total.sulfur.dioxide < 65.5     to the right, agree=0.881, adj=0.273, (0 split)
##       density              < 0.99721  to the left,  agree=0.857, adj=0.129, (0 split)
## 
## Node number 11: 316 observations,    complexity param=0.007513148
##   predicted class=high  expected loss=0.4272152  P(node) =0.08829282
##     class counts:   181   135
##    probabilities: 0.573 0.427 
##   left son=22 (156 obs) right son=23 (160 obs)
##   Primary splits:
##       sulphates            < 0.575    to the right, improve=17.977220, (0 missing)
##       chlorides            < 0.0405   to the right, improve= 8.420782, (0 missing)
##       residual.sugar       < 1.675    to the right, improve= 8.367262, (0 missing)
##       total.sulfur.dioxide < 36       to the left,  improve= 6.198053, (0 missing)
##       alcohol              < 11.15    to the right, improve= 5.308257, (0 missing)
##   Surrogate splits:
##       density              < 0.9949   to the right, agree=0.741, adj=0.474, (0 split)
##       chlorides            < 0.0595   to the right, agree=0.731, adj=0.455, (0 split)
##       total.sulfur.dioxide < 36       to the left,  agree=0.728, adj=0.449, (0 split)
##       citric.acid          < 0.395    to the right, agree=0.655, adj=0.301, (0 split)
## 
## Node number 12: 361 observations,    complexity param=0.002629602
##   predicted class=high  expected loss=0.2686981  P(node) =0.1008662
##     class counts:   264    97
##    probabilities: 0.731 0.269 
##   left son=24 (254 obs) right son=25 (107 obs)
##   Primary splits:
##       free.sulfur.dioxide < 26.5     to the right, improve=7.014382, (0 missing)
##       residual.sugar      < 5.45     to the right, improve=4.797638, (0 missing)
##       pH                  < 2.995    to the right, improve=3.981735, (0 missing)
##       fixed.acidity       < 9.15     to the left,  improve=3.931793, (0 missing)
##       chlorides           < 0.0335   to the right, improve=3.860880, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 103.5    to the right, agree=0.814, adj=0.374, (0 split)
##       fixed.acidity        < 8.95     to the left,  agree=0.723, adj=0.065, (0 split)
##       residual.sugar       < 1.35     to the right, agree=0.717, adj=0.047, (0 split)
##       density              < 0.99193  to the right, agree=0.715, adj=0.037, (0 split)
## 
## Node number 13: 333 observations,    complexity param=0.00864012
##   predicted class=high  expected loss=0.4684685  P(node) =0.09304275
##     class counts:   177   156
##    probabilities: 0.532 0.468 
##   left son=26 (272 obs) right son=27 (61 obs)
##   Primary splits:
##       alcohol     < 9.05     to the right, improve=7.232727, (0 missing)
##       chlorides   < 0.0495   to the left,  improve=6.230406, (0 missing)
##       citric.acid < 0.195    to the right, improve=5.214211, (0 missing)
##       pH          < 2.955    to the left,  improve=4.607654, (0 missing)
##       sulphates   < 0.475    to the right, improve=4.486647, (0 missing)
##   Surrogate splits:
##       residual.sugar       < 17.9     to the left,  agree=0.829, adj=0.066, (0 split)
##       density              < 0.999725 to the left,  agree=0.829, adj=0.066, (0 split)
##       total.sulfur.dioxide < 229      to the left,  agree=0.823, adj=0.033, (0 split)
## 
## Node number 14: 261 observations,    complexity param=0.006260957
##   predicted class=low   expected loss=0.4252874  P(node) =0.0729254
##     class counts:   111   150
##    probabilities: 0.425 0.575 
##   left son=28 (190 obs) right son=29 (71 obs)
##   Primary splits:
##       density          < 0.99467  to the right, improve=4.849958, (0 missing)
##       volatile.acidity < 0.555    to the left,  improve=4.788846, (0 missing)
##       sulphates        < 0.545    to the right, improve=3.824913, (0 missing)
##       pH               < 3.425    to the left,  improve=3.768881, (0 missing)
##       fixed.acidity    < 7.55     to the right, improve=3.652153, (0 missing)
##   Surrogate splits:
##       residual.sugar   < 1.65     to the right, agree=0.828, adj=0.366, (0 split)
##       chlorides        < 0.0475   to the right, agree=0.785, adj=0.211, (0 split)
##       fixed.acidity    < 5.75     to the right, agree=0.762, adj=0.127, (0 split)
##       volatile.acidity < 0.315    to the right, agree=0.762, adj=0.127, (0 split)
## 
## Node number 15: 792 observations,    complexity param=0.002629602
##   predicted class=low   expected loss=0.2752525  P(node) =0.2212909
##     class counts:   218   574
##    probabilities: 0.275 0.725 
##   left son=30 (434 obs) right son=31 (358 obs)
##   Primary splits:
##       volatile.acidity     < 0.4225   to the left,  improve=5.179714, (0 missing)
##       fixed.acidity        < 10.85    to the right, improve=4.327723, (0 missing)
##       sulphates            < 0.685    to the right, improve=4.186987, (0 missing)
##       density              < 1.00129  to the right, improve=2.771093, (0 missing)
##       total.sulfur.dioxide < 41.5     to the left,  improve=2.726417, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 111.5    to the right, agree=0.768, adj=0.486, (0 split)
##       chlorides            < 0.0635   to the left,  agree=0.749, adj=0.444, (0 split)
##       residual.sugar       < 4.55     to the right, agree=0.707, adj=0.352, (0 split)
##       free.sulfur.dioxide  < 21.5     to the right, agree=0.705, adj=0.346, (0 split)
## 
## Node number 20: 709 observations,    complexity param=0.002253944
##   predicted class=high  expected loss=0.1720733  P(node) =0.1981
##     class counts:   587   122
##    probabilities: 0.828 0.172 
##   left son=40 (700 obs) right son=41 (9 obs)
##   Primary splits:
##       residual.sugar < 14.55    to the left,  improve=4.459819, (0 missing)
##       pH             < 2.975    to the right, improve=4.071160, (0 missing)
##       sulphates      < 0.525    to the right, improve=4.016235, (0 missing)
##       chlorides      < 0.0385   to the left,  improve=2.880302, (0 missing)
##       fixed.acidity  < 7.05     to the left,  improve=2.665545, (0 missing)
## 
## Node number 21: 139 observations,    complexity param=0.003005259
##   predicted class=high  expected loss=0.3597122  P(node) =0.03883766
##     class counts:    89    50
##    probabilities: 0.640 0.360 
##   left son=42 (95 obs) right son=43 (44 obs)
##   Primary splits:
##       total.sulfur.dioxide < 123      to the left,  improve=6.882365, (0 missing)
##       residual.sugar       < 5.45     to the left,  improve=4.293256, (0 missing)
##       alcohol              < 10.65    to the right, improve=4.250181, (0 missing)
##       chlorides            < 0.0535   to the right, improve=4.228158, (0 missing)
##       fixed.acidity        < 7.45     to the right, improve=3.966908, (0 missing)
##   Surrogate splits:
##       free.sulfur.dioxide < 38.5     to the left,  agree=0.784, adj=0.318, (0 split)
##       residual.sugar      < 4.125    to the left,  agree=0.777, adj=0.295, (0 split)
##       chlorides           < 0.063    to the right, agree=0.763, adj=0.250, (0 split)
##       sulphates           < 0.515    to the right, agree=0.734, adj=0.159, (0 split)
## 
## Node number 22: 156 observations,    complexity param=0.003005259
##   predicted class=high  expected loss=0.2564103  P(node) =0.04358759
##     class counts:   116    40
##    probabilities: 0.744 0.256 
##   left son=44 (142 obs) right son=45 (14 obs)
##   Primary splits:
##       residual.sugar       < 1.55     to the right, improve=4.593819, (0 missing)
##       total.sulfur.dioxide < 76.5     to the left,  improve=4.136950, (0 missing)
##       alcohol              < 11.25    to the right, improve=3.214031, (0 missing)
##       chlorides            < 0.0425   to the right, improve=3.152181, (0 missing)
##       pH                   < 3.085    to the right, improve=3.152181, (0 missing)
##   Surrogate splits:
##       density          < 0.992005 to the right, agree=0.942, adj=0.357, (0 split)
##       chlorides        < 0.0365   to the right, agree=0.936, adj=0.286, (0 split)
##       volatile.acidity < 0.185    to the right, agree=0.917, adj=0.071, (0 split)
## 
## Node number 23: 160 observations,    complexity param=0.007513148
##   predicted class=low   expected loss=0.40625  P(node) =0.04470522
##     class counts:    65    95
##    probabilities: 0.406 0.594 
##   left son=46 (75 obs) right son=47 (85 obs)
##   Primary splits:
##       free.sulfur.dioxide < 11.5     to the right, improve=10.599260, (0 missing)
##       volatile.acidity    < 0.345    to the left,  improve= 6.201674, (0 missing)
##       citric.acid         < 0.265    to the right, improve= 4.206731, (0 missing)
##       pH                  < 3.365    to the left,  improve= 4.167787, (0 missing)
##       density             < 0.99571  to the left,  improve= 2.812500, (0 missing)
##   Surrogate splits:
##       volatile.acidity     < 0.2975   to the left,  agree=0.662, adj=0.280, (0 split)
##       total.sulfur.dioxide < 29       to the right, agree=0.656, adj=0.267, (0 split)
##       pH                   < 3.035    to the left,  agree=0.613, adj=0.173, (0 split)
##       citric.acid          < 0.245    to the right, agree=0.581, adj=0.107, (0 split)
## 
## Node number 24: 254 observations,    complexity param=0.002253944
##   predicted class=high  expected loss=0.2047244  P(node) =0.07096954
##     class counts:   202    52
##    probabilities: 0.795 0.205 
##   left son=48 (249 obs) right son=49 (5 obs)
##   Primary splits:
##       pH                   < 2.88     to the right, improve=3.614686, (0 missing)
##       citric.acid          < 0.685    to the left,  improve=2.296870, (0 missing)
##       sulphates            < 0.555    to the right, improve=2.135927, (0 missing)
##       alcohol              < 9.45     to the right, improve=1.868311, (0 missing)
##       total.sulfur.dioxide < 123.5    to the left,  improve=1.816450, (0 missing)
## 
## Node number 25: 107 observations,    complexity param=0.002629602
##   predicted class=high  expected loss=0.4205607  P(node) =0.02989662
##     class counts:    62    45
##    probabilities: 0.579 0.421 
##   left son=50 (40 obs) right son=51 (67 obs)
##   Primary splits:
##       residual.sugar       < 2.95     to the right, improve=6.215204, (0 missing)
##       fixed.acidity        < 6.85     to the left,  improve=5.831802, (0 missing)
##       density              < 0.9963   to the right, improve=3.926798, (0 missing)
##       chlorides            < 0.031    to the right, improve=2.855247, (0 missing)
##       total.sulfur.dioxide < 99.5     to the left,  improve=2.179469, (0 missing)
##   Surrogate splits:
##       density             < 0.99421  to the right, agree=0.776, adj=0.400, (0 split)
##       free.sulfur.dioxide < 24.5     to the right, agree=0.738, adj=0.300, (0 split)
##       alcohol             < 9.65     to the left,  agree=0.692, adj=0.175, (0 split)
##       chlorides           < 0.0495   to the right, agree=0.664, adj=0.100, (0 split)
## 
## Node number 26: 272 observations,    complexity param=0.00864012
##   predicted class=high  expected loss=0.4191176  P(node) =0.07599888
##     class counts:   158   114
##    probabilities: 0.581 0.419 
##   left son=52 (164 obs) right son=53 (108 obs)
##   Primary splits:
##       chlorides            < 0.0495   to the left,  improve=7.604681, (0 missing)
##       citric.acid          < 0.195    to the right, improve=4.331373, (0 missing)
##       total.sulfur.dioxide < 162.5    to the left,  improve=4.199836, (0 missing)
##       sulphates            < 0.485    to the right, improve=4.196369, (0 missing)
##       alcohol              < 9.85     to the right, improve=3.544510, (0 missing)
##   Surrogate splits:
##       free.sulfur.dioxide  < 14.5     to the right, agree=0.654, adj=0.130, (0 split)
##       total.sulfur.dioxide < 107.5    to the right, agree=0.647, adj=0.111, (0 split)
##       density              < 0.99745  to the left,  agree=0.643, adj=0.102, (0 split)
##       residual.sugar       < 4.85     to the right, agree=0.640, adj=0.093, (0 split)
## 
## Node number 27: 61 observations,    complexity param=0.003756574
##   predicted class=low   expected loss=0.3114754  P(node) =0.01704387
##     class counts:    19    42
##    probabilities: 0.311 0.689 
##   left son=54 (5 obs) right son=55 (56 obs)
##   Primary splits:
##       pH                   < 2.975    to the left,  improve=5.163934, (0 missing)
##       citric.acid          < 0.425    to the left,  improve=3.697769, (0 missing)
##       residual.sugar       < 13.05    to the right, improve=3.243676, (0 missing)
##       total.sulfur.dioxide < 154.5    to the right, improve=2.346892, (0 missing)
##       chlorides            < 0.0575   to the right, improve=2.208152, (0 missing)
## 
## Node number 28: 190 observations,    complexity param=0.006260957
##   predicted class=low   expected loss=0.4842105  P(node) =0.05308745
##     class counts:    92    98
##    probabilities: 0.484 0.516 
##   left son=56 (107 obs) right son=57 (83 obs)
##   Primary splits:
##       volatile.acidity < 0.555    to the left,  improve=8.614981, (0 missing)
##       pH               < 3.425    to the left,  improve=5.423459, (0 missing)
##       citric.acid      < 0.035    to the right, improve=3.162573, (0 missing)
##       density          < 0.99496  to the left,  improve=3.094336, (0 missing)
##       sulphates        < 0.725    to the right, improve=2.207081, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.135    to the right, agree=0.732, adj=0.386, (0 split)
##       chlorides            < 0.0735   to the left,  agree=0.726, adj=0.373, (0 split)
##       residual.sugar       < 6.15     to the right, agree=0.679, adj=0.265, (0 split)
##       total.sulfur.dioxide < 93.5     to the right, agree=0.663, adj=0.229, (0 split)
## 
## Node number 29: 71 observations
##   predicted class=low   expected loss=0.2676056  P(node) =0.01983794
##     class counts:    19    52
##    probabilities: 0.268 0.732 
## 
## Node number 30: 434 observations,    complexity param=0.002629602
##   predicted class=low   expected loss=0.3271889  P(node) =0.1212629
##     class counts:   142   292
##    probabilities: 0.327 0.673 
##   left son=60 (19 obs) right son=61 (415 obs)
##   Primary splits:
##       fixed.acidity        < 10.45    to the right, improve=5.065405, (0 missing)
##       total.sulfur.dioxide < 60.5     to the left,  improve=4.571476, (0 missing)
##       sulphates            < 0.675    to the right, improve=4.542000, (0 missing)
##       alcohol              < 9.516667 to the right, improve=3.625194, (0 missing)
##       citric.acid          < 0.255    to the right, improve=3.343132, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 18.5     to the left,  agree=0.965, adj=0.211, (0 split)
## 
## Node number 31: 358 observations
##   predicted class=low   expected loss=0.2122905  P(node) =0.1000279
##     class counts:    76   282
##    probabilities: 0.212 0.788 
## 
## Node number 40: 700 observations
##   predicted class=high  expected loss=0.1657143  P(node) =0.1955854
##     class counts:   584   116
##    probabilities: 0.834 0.166 
## 
## Node number 41: 9 observations
##   predicted class=low   expected loss=0.3333333  P(node) =0.002514669
##     class counts:     3     6
##    probabilities: 0.333 0.667 
## 
## Node number 42: 95 observations
##   predicted class=high  expected loss=0.2526316  P(node) =0.02654373
##     class counts:    71    24
##    probabilities: 0.747 0.253 
## 
## Node number 43: 44 observations
##   predicted class=low   expected loss=0.4090909  P(node) =0.01229394
##     class counts:    18    26
##    probabilities: 0.409 0.591 
## 
## Node number 44: 142 observations
##   predicted class=high  expected loss=0.2183099  P(node) =0.03967589
##     class counts:   111    31
##    probabilities: 0.782 0.218 
## 
## Node number 45: 14 observations
##   predicted class=low   expected loss=0.3571429  P(node) =0.003911707
##     class counts:     5     9
##    probabilities: 0.357 0.643 
## 
## Node number 46: 75 observations
##   predicted class=high  expected loss=0.4  P(node) =0.02095557
##     class counts:    45    30
##    probabilities: 0.600 0.400 
## 
## Node number 47: 85 observations
##   predicted class=low   expected loss=0.2352941  P(node) =0.02374965
##     class counts:    20    65
##    probabilities: 0.235 0.765 
## 
## Node number 48: 249 observations
##   predicted class=high  expected loss=0.1927711  P(node) =0.06957251
##     class counts:   201    48
##    probabilities: 0.807 0.193 
## 
## Node number 49: 5 observations
##   predicted class=low   expected loss=0.2  P(node) =0.001397038
##     class counts:     1     4
##    probabilities: 0.200 0.800 
## 
## Node number 50: 40 observations
##   predicted class=high  expected loss=0.2  P(node) =0.01117631
##     class counts:    32     8
##    probabilities: 0.800 0.200 
## 
## Node number 51: 67 observations
##   predicted class=low   expected loss=0.4477612  P(node) =0.01872031
##     class counts:    30    37
##    probabilities: 0.448 0.552 
## 
## Node number 52: 164 observations
##   predicted class=high  expected loss=0.3231707  P(node) =0.04582286
##     class counts:   111    53
##    probabilities: 0.677 0.323 
## 
## Node number 53: 108 observations
##   predicted class=low   expected loss=0.4351852  P(node) =0.03017603
##     class counts:    47    61
##    probabilities: 0.435 0.565 
## 
## Node number 54: 5 observations
##   predicted class=high  expected loss=0  P(node) =0.001397038
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 55: 56 observations
##   predicted class=low   expected loss=0.25  P(node) =0.01564683
##     class counts:    14    42
##    probabilities: 0.250 0.750 
## 
## Node number 56: 107 observations
##   predicted class=high  expected loss=0.3831776  P(node) =0.02989662
##     class counts:    66    41
##    probabilities: 0.617 0.383 
## 
## Node number 57: 83 observations
##   predicted class=low   expected loss=0.313253  P(node) =0.02319084
##     class counts:    26    57
##    probabilities: 0.313 0.687 
## 
## Node number 60: 19 observations
##   predicted class=high  expected loss=0.3157895  P(node) =0.005308745
##     class counts:    13     6
##    probabilities: 0.684 0.316 
## 
## Node number 61: 415 observations
##   predicted class=low   expected loss=0.3108434  P(node) =0.1159542
##     class counts:   129   286
##    probabilities: 0.311 0.689
# Classify every test row with the fitted tree `modeldc`. Column 13 is
# removed from the test frame before it is handed to predict().
predict_rpart <- predict(modeldc, wine_test_for_dc[, -13], type = "class")

# rpart.plot-style rendering of the tree.
prp(modeldc, type = 2, extra = 3, tweak = 0.8,
    main = "The Quality of Wine", compress = TRUE)

# Base-graphics rendering of the same tree, with node labels added by text().
# The title used to say "Kyphosis" (a leftover from the rpart example data
# set); it now names the model that is actually being plotted.
plot(modeldc, uniform = TRUE, main = "Classification Tree for Wine Quality")
text(modeldc, use.n = TRUE, all = TRUE, cex = 0.8)

rpart.plot(modeldc)

# caret provides confusionMatrix().
library(caret)


# Cross-tabulate the tree's predictions against the true test labels.
cm <- confusionMatrix(
  predict_rpart,
  as.factor(wine_test_for_dc$qualityvariable)
)
print("Overall: ")
## [1] "Overall: "
cm[["overall"]]
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   7.300402e-01   4.217100e-01   7.085237e-01   7.507786e-01   6.226307e-01 
## AccuracyPValue  McnemarPValue 
##   1.826395e-21   3.327160e-01
print("Class: ")
## [1] "Class: "
cm[["byClass"]]
##          Sensitivity          Specificity       Pos Pred Value 
##            0.7933579            0.6255708            0.7775769 
##       Neg Pred Value            Precision               Recall 
##            0.6472441            0.7775769            0.7933579 
##                   F1           Prevalence       Detection Rate 
##            0.7853881            0.6226307            0.4939690 
## Detection Prevalence    Balanced Accuracy 
##            0.6352671            0.7094644
#loadPkg("rpart")
#loadPkg("caret")


# Depth sweep: refit the classification tree with maxdepth = 2, ..., 20
# (cp fixed at 0.002) and record, for each depth, the test-set accuracy
# together with every per-class metric reported by caret.
#
# One single-row data frame is stored per depth and the rows are bound
# together once after the loop, instead of growing the result with rbind()
# on every iteration. The column names of the result come from the per-depth
# rows: Depth, Accuracy, then the names of cm$byClass as made syntactic by
# data.frame().
#
# Note: the loop assigns the globals `models`, `preds`, `cm`, `cmaccu` and
# `cmt`; once it finishes they describe the depth-20 fit, so `cm` no longer
# holds the confusion matrix computed before this sweep.
depths <- 2:20
depth_rows <- vector("list", length(depths))

for (deep in depths) {
  # All tree controls are set in one place through rpart.control().
  models <- rpart(
    qualityvariable ~ .,
    data = wine_training_for_dc,
    method = "class",
    control = rpart.control(maxdepth = deep, cp = 0.002)
  )
  preds <- predict(models, wine_test_for_dc[, -13], type = "class")
  cm <- confusionMatrix(preds, as.factor(wine_test_for_dc$qualityvariable))

  cmaccu <- cm$overall["Accuracy"]

  # One row per depth: Depth and Accuracy first, then the per-class metrics
  # (transposed so that each metric becomes a column).
  cmt <- data.frame(Depth = deep, Accuracy = cmaccu, row.names = NULL)
  cmt <- cbind(cmt, data.frame(t(cm$byClass)))
  depth_rows[[match(deep, depths)]] <- cmt
}

confusionMatrixResultDf <- do.call(rbind, depth_rows)
print(confusionMatrixResultDf)
##    Depth  Accuracy Sensitivity Specificity Pos.Pred.Value Neg.Pred.Value
## 1      2 0.7156806   0.8348708   0.5190259      0.7411957      0.6557692
## 2      3 0.7156806   0.8348708   0.5190259      0.7411957      0.6557692
## 3      4 0.7260195   0.8035055   0.5981735      0.7674009      0.6485149
## 4      5 0.7306146   0.7933579   0.6270928      0.7782805      0.6477987
## 5      6 0.7306146   0.8219557   0.5799087      0.7634961      0.6637631
## 6      7 0.7260195   0.8035055   0.5981735      0.7674009      0.6485149
## 7      8 0.7306146   0.8145756   0.5920852      0.7671590      0.6593220
## 8      9 0.7277427   0.8274908   0.5631659      0.7576014      0.6642729
## 9     10 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 10    11 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 11    12 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 12    13 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 13    14 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 14    15 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 15    16 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 16    17 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 17    18 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 18    19 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
## 19    20 0.7254451   0.8311808   0.5509893      0.7533445      0.6642202
##    Precision    Recall        F1 Prevalence Detection.Rate Detection.Prevalence
## 1  0.7411957 0.8348708 0.7852495  0.6226307      0.5198162            0.7013211
## 2  0.7411957 0.8348708 0.7852495  0.6226307      0.5198162            0.7013211
## 3  0.7674009 0.8035055 0.7850383  0.6226307      0.5002872            0.6519242
## 4  0.7782805 0.7933579 0.7857469  0.6226307      0.4939690            0.6346927
## 5  0.7634961 0.8219557 0.7916482  0.6226307      0.5117748            0.6703044
## 6  0.7674009 0.8035055 0.7850383  0.6226307      0.5002872            0.6519242
## 7  0.7671590 0.8145756 0.7901566  0.6226307      0.5071798            0.6611143
## 8  0.7576014 0.8274908 0.7910053  0.6226307      0.5152211            0.6800689
## 9  0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 10 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 11 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 12 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 13 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 14 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 15 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 16 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 17 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 18 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
## 19 0.7533445 0.8311808 0.7903509  0.6226307      0.5175187            0.6869615
##    Balanced.Accuracy
## 1          0.6769484
## 2          0.6769484
## 3          0.7008395
## 4          0.7102254
## 5          0.7009322
## 6          0.7008395
## 7          0.7033304
## 8          0.6953283
## 9          0.6910851
## 10         0.6910851
## 11         0.6910851
## 12         0.6910851
## 13         0.6910851
## 14         0.6910851
## 15         0.6910851
## 16         0.6910851
## 17         0.6910851
## 18         0.6910851
## 19         0.6910851
library(rpart.plot)

# Plot the unpruned tree, then the cross-validation results stored in its
# complexity-parameter table.
rpart.plot(modeldc)

plotcp(modeldc)

# Prune back to the cp-table row with the smallest cross-validated error.
prqualityfit <- prune(
  modeldc,
  cp = modeldc$cptable[which.min(modeldc$cptable[, "xerror"]), "CP"]
)
# Share of test rows whose pruned-tree prediction matches the true label.
pred <- predict(prqualityfit, wine_test_for_dc[, -13], type = "class")
accuracy_prun <- mean(as.factor(wine_test_for_dc$qualityvariable) == pred)
data.frame(accuracy_prun)
##   accuracy_prun
## 1     0.7403791
rpart.plot(prqualityfit)

3.3 Logistic Regression

# Fix the RNG seed so the random train/test assignment made further down is
# reproducible.
set.seed(1000)

# Binary target: 1 when quality is above 5, otherwise 0.
qualityvariable <- ifelse(wine$quality > 5, 1, 0)
winelogit <- data.frame(wine, qualityvariable)

# Drop columns 1 and 13 by position; the preview below shows that only the
# eleven measurements and qualityvariable remain.
winelogit <- winelogit[-c(1, 13)]
#winelogit <- winelogit[, -1]
head(winelogit)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181           6.5            0.240        0.38            1.0     0.027
## 5017           8.8            0.550        0.04            2.2     0.119
## 2875           5.4            0.230        0.36            1.5     0.030
## 6442          11.1            0.440        0.42            2.2     0.064
## 1301           9.0            0.245        0.38            5.9     0.045
## 1486           7.4            0.280        0.49            1.5     0.034
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 5017                  14                   56 0.99620 3.21      0.60    10.9
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 6442                  14                   19 0.99758 3.25      0.57    10.4
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
##      qualityvariable
## 3181               1
## 5017               1
## 2875               1
## 6442               1
## 1301               1
## 1486               1
# Label each row 1 (probability 0.67) or 2 (probability 0.33), drawing with
# replacement so every row gets its own label. Rows labelled 1 become the
# training set.
wine_sample <- sample.int(
  2,
  size = nrow(winelogit),
  replace = TRUE,
  prob = c(0.67, 0.33)
)
wine_training <- winelogit[wine_sample == 1, ]
head(wine_training)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181           6.5            0.240        0.38            1.0     0.027
## 2875           5.4            0.230        0.36            1.5     0.030
## 1301           9.0            0.245        0.38            5.9     0.045
## 1486           7.4            0.280        0.49            1.5     0.034
## 2522           6.5            0.180        0.33            1.4     0.029
## 6415           6.1            0.320        0.25            2.3     0.071
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
## 2522                  35                  138 0.99114 3.36      0.60    11.5
## 6415                  23                   58 0.99633 3.42      0.97    10.6
##      qualityvariable
## 3181               1
## 2875               1
## 1301               1
## 1486               1
## 2522               1
## 6415               0
# Rows labelled 2 by the random assignment form the held-out test set.
wine_test <- winelogit[wine_sample == 2, ]
# Row counts of the test and training partitions.
nrow(wine_test)
## [1] 1741
nrow(wine_training)
## [1] 3579
#head(winelogit)
#wine.trainLabels_logit <- winelogit[wine_sample==1, 12]
#length(wine.trainLabels_logit)

#wine.testLabels_logit <- winelogit[wine_sample==2, 12]
head(winelogit)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181           6.5            0.240        0.38            1.0     0.027
## 5017           8.8            0.550        0.04            2.2     0.119
## 2875           5.4            0.230        0.36            1.5     0.030
## 6442          11.1            0.440        0.42            2.2     0.064
## 1301           9.0            0.245        0.38            5.9     0.045
## 1486           7.4            0.280        0.49            1.5     0.034
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 5017                  14                   56 0.99620 3.21      0.60    10.9
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 6442                  14                   19 0.99758 3.25      0.57    10.4
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
##      qualityvariable
## 3181               1
## 5017               1
## 2875               1
## 6442               1
## 1301               1
## 1486               1
# Pairwise correlations between the predictors of the training partition;
# the response column (qualityvariable, column 12) is excluded by name.
corrlogit <- cor(wine_training[, names(wine_training) != "qualityvariable"])
corrlogit
##                      fixed.acidity volatile.acidity  citric.acid residual.sugar
## fixed.acidity            1.0000000       0.19671670  0.358316546     -0.1085421
## volatile.acidity         0.1967167       1.00000000 -0.384026118     -0.1603380
## citric.acid              0.3583165      -0.38402612  1.000000000      0.1489293
## residual.sugar          -0.1085421      -0.16033804  0.148929286      1.0000000
## chlorides                0.2800980       0.35776885  0.067716597     -0.1199718
## free.sulfur.dioxide     -0.2888181      -0.34165027  0.127150508      0.4233792
## total.sulfur.dioxide    -0.3269967      -0.40388082  0.188901853      0.4975264
## density                  0.4741359       0.30710155  0.108761858      0.5282271
## pH                      -0.2729064       0.26685772 -0.359684308     -0.2417892
## sulphates                0.3145093       0.22965406  0.075352942     -0.1710380
## alcohol                 -0.1023721      -0.06196878 -0.005754844     -0.3109122
##                        chlorides free.sulfur.dioxide total.sulfur.dioxide
## fixed.acidity         0.28009798         -0.28881809          -0.32699667
## volatile.acidity      0.35776885         -0.34165027          -0.40388082
## citric.acid           0.06771660          0.12715051           0.18890185
## residual.sugar       -0.11997177          0.42337919           0.49752645
## chlorides             1.00000000         -0.18672133          -0.26844202
## free.sulfur.dioxide  -0.18672133          1.00000000           0.72691159
## total.sulfur.dioxide -0.26844202          0.72691159           1.00000000
## density               0.36326534          0.01793178           0.01076784
## pH                    0.02438048         -0.15346995          -0.22802001
## sulphates             0.41322151         -0.21377492          -0.29265329
## alcohol              -0.26190284         -0.18240184          -0.25132571
##                          density          pH   sulphates      alcohol
## fixed.acidity         0.47413591 -0.27290636  0.31450933 -0.102372143
## volatile.acidity      0.30710155  0.26685772  0.22965406 -0.061968778
## citric.acid           0.10876186 -0.35968431  0.07535294 -0.005754844
## residual.sugar        0.52822707 -0.24178925 -0.17103796 -0.310912227
## chlorides             0.36326534  0.02438048  0.41322151 -0.261902845
## free.sulfur.dioxide   0.01793178 -0.15346995 -0.21377492 -0.182401841
## total.sulfur.dioxide  0.01076784 -0.22802001 -0.29265329 -0.251325707
## density               1.00000000  0.02424847  0.28498557 -0.662449545
## pH                    0.02424847  1.00000000  0.15973535  0.105306308
## sulphates             0.28498557  0.15973535  1.00000000 -0.013009676
## alcohol              -0.66244955  0.10530631 -0.01300968  1.000000000
library(corrplot)
# Plot the lower triangle of the predictor correlation matrix as squares.
corrplot(corrlogit, method = "square", type = "lower")

# Logistic regression of qualityvariable on seven predictors, fitted on the
# training partition only.
logitmodel <- glm(
  formula = qualityvariable ~ volatile.acidity + citric.acid +
    residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol,
  family = "binomial",
  data = wine_training
)
summary(logitmodel)
## 
## Call:
## glm(formula = qualityvariable ~ volatile.acidity + citric.acid + 
##     residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol, 
##     family = "binomial", data = wine_training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2491  -0.9119   0.4425   0.8082   2.6592  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -12.763016   1.057595 -12.068  < 2e-16 ***
## volatile.acidity     -4.044571   0.300817 -13.445  < 2e-16 ***
## citric.acid          -0.205288   0.311280  -0.659 0.509577    
## residual.sugar        0.031250   0.009970   3.135 0.001721 ** 
## free.sulfur.dioxide   0.007085   0.002679   2.645 0.008177 ** 
## pH                    0.975925   0.287548   3.394 0.000689 ***
## sulphates             2.264890   0.318640   7.108 1.18e-12 ***
## alcohol               0.968095   0.044666  21.674  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4724  on 3578  degrees of freedom
## Residual deviance: 3719  on 3571  degrees of freedom
## AIC: 3735
## 
## Number of Fisher Scoring iterations: 4
# Exponentiate the fitted log-odds coefficients to obtain odds ratios: the
# multiplicative change in the odds for a one-unit increase in a predictor.
expcoeff <- exp(coef(logitmodel))
expcoeff
##         (Intercept)    volatile.acidity         citric.acid      residual.sugar 
##        2.864789e-06        1.751722e-02        8.144129e-01        1.031744e+00 
## free.sulfur.dioxide                  pH           sulphates             alcohol 
##        1.007110e+00        2.653620e+00        9.630068e+00        2.632925e+00
# First rows of the training partition, shown again for reference.
head(wine_training, n = 6L)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181           6.5            0.240        0.38            1.0     0.027
## 2875           5.4            0.230        0.36            1.5     0.030
## 1301           9.0            0.245        0.38            5.9     0.045
## 1486           7.4            0.280        0.49            1.5     0.034
## 2522           6.5            0.180        0.33            1.4     0.029
## 6415           6.1            0.320        0.25            2.3     0.071
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 3181                  31                   90 0.98926 3.24      0.36    12.3
## 2875                  74                  121 0.98976 3.24      0.99    12.1
## 1301                  52                  159 0.99500 2.93      0.35    10.2
## 1486                  20                  126 0.99180 2.98      0.39    10.6
## 2522                  35                  138 0.99114 3.36      0.60    11.5
## 6415                  23                   58 0.99633 3.42      0.97    10.6
##      qualityvariable
## 3181               1
## 2875               1
## 1301               1
## 1486               1
## 2522               1
## 6415               0
# Predicted probabilities for every row of the test partition. predict()
# looks the model's predictors up by name, so the whole test frame can be
# passed; columns the model does not use are ignored.
fitted.results <- predict(logitmodel, newdata = wine_test, type = "response")
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
fitted.results_val <- ifelse(fitted.results > 0.5, 1, 0)
# Share of test rows whose predicted class differs from the observed class.
misClasificError <- mean(fitted.results_val != wine_test$qualityvariable)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.729465824238943"
library(car)
## Loading required package: carData
# Variance inflation factor of each predictor in the fitted model.
car::vif(logitmodel)
##    volatile.acidity         citric.acid      residual.sugar free.sulfur.dioxide 
##            1.456212            1.363621            1.437728            1.432348 
##                  pH           sulphates             alcohol 
##            1.292089            1.231206            1.179129
# Sanity check: there is one prediction per test observation.
length(fitted.results)
## [1] 1741
length(wine_test[["qualityvariable"]])
## [1] 1741
# Confusion matrix of predicted vs. observed classes on the test partition.
# Both factors get explicit levels so the call still works when the model
# predicts only one class. positive = "1" makes sensitivity, specificity and
# the predictive values describe class 1, the class whose probability the
# model estimates and the "case" level used for the ROC curve; without it
# caret takes the first level ("0") as the positive class.
confusionMatrix(
  data = factor(fitted.results_val, levels = c(0, 1)),
  reference = factor(wine_test$qualityvariable, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 354 168
##          1 303 916
##                                           
##                Accuracy : 0.7295          
##                  95% CI : (0.7079, 0.7502)
##     No Information Rate : 0.6226          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4             
##                                           
##  Mcnemar's Test P-Value : 6.642e-10       
##                                           
##             Sensitivity : 0.8450          
##             Specificity : 0.5388          
##          Pos Pred Value : 0.7514          
##          Neg Pred Value : 0.6782          
##              Prevalence : 0.6226          
##          Detection Rate : 0.5261          
##    Detection Prevalence : 0.7002          
##       Balanced Accuracy : 0.6919          
##                                           
##        'Positive' Class : 1               
## 
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Store the predicted probabilities next to the observed classes.
wine_test[["prob"]] <- fitted.results
# ROC curve of the observed class against the predicted probability.
h <- roc(qualityvariable ~ prob, data = wine_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(h) # area under the curve; 0.8 or higher is preferred
## Area under the curve: 0.8014
plot(h)

3.4 Feature Selection for Logistic Regression

library(leaps)
# Exhaustive best-subset search over the same seven predictors used in the
# logistic model, keeping the single best model of each size.
# NOTE(review): regsubsets() performs a least-squares subset search, so with
# a 0/1 response it ranks subsets for a linear approximation rather than for
# the logistic model itself; confirm this is intended.
reg.leaps <- regsubsets(
  qualityvariable ~ volatile.acidity + citric.acid +
    residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol,
  data = wine_training,
  method = "exhaustive",
  nbest = 1
)
# Selected variables per model, ranked by adjusted R^2.
plot(reg.leaps, main = "Adjusted R^2", scale = "adjr2")

# Same display ranked by BIC.
plot(reg.leaps, main = "BIC", scale = "bic")

# Same display ranked by Mallows' Cp.
plot(reg.leaps, main = "Cp", scale = "Cp")

library(bestglm)
# First rows of the test partition, which now also carries the prob column
# added for the ROC analysis.
head(wine_test, n = 6L)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 5017           8.8             0.55        0.04            2.2     0.119
## 6442          11.1             0.44        0.42            2.2     0.064
## 752            6.9             0.20        0.50           10.0     0.036
## 5243          11.9             0.57        0.50            2.6     0.082
## 2299           6.3             0.41        0.18            3.5     0.027
## 1064           6.7             0.26        0.26            4.1     0.073
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 5017                  14                   56 0.99620 3.21      0.60    10.9
## 6442                  14                   19 0.99758 3.25      0.57    10.4
## 752                   78                  167 0.99640 3.15      0.55    10.2
## 5243                   6                   32 1.00060 3.12      0.78    10.7
## 2299                  23                  109 0.99018 3.34      0.54    12.8
## 1064                  36                  202 0.99560 3.30      0.67     9.5
##      qualityvariable      prob
## 5017               1 0.5538146
## 6442               1 0.5175007
## 752                1 0.7997941
## 5243               1 0.5306657
## 2299               1 0.9363355
## 1064               0 0.6106754
# Best-subset selection with bestglm(). bestglm() uses the LAST column of Xy
# as the response, so the frame must end with qualityvariable. Besides
# total.sulfur.dioxide (column 7, dropped as before), the prob column holding
# the logistic model's predictions is removed as well: left in place it
# became the response and turned qualityvariable into a predictor.
wine_test_1 <- wine_test[, !(names(wine_test) %in%
  c("total.sulfur.dioxide", "prob"))]
head(wine_test_1)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 5017           8.8             0.55        0.04            2.2     0.119
## 6442          11.1             0.44        0.42            2.2     0.064
## 752            6.9             0.20        0.50           10.0     0.036
## 5243          11.9             0.57        0.50            2.6     0.082
## 2299           6.3             0.41        0.18            3.5     0.027
## 1064           6.7             0.26        0.26            4.1     0.073
##      free.sulfur.dioxide density   pH sulphates alcohol qualityvariable
## 5017                  14 0.99620 3.21      0.60    10.9               1
## 6442                  14 0.99758 3.25      0.57    10.4               1
## 752                   78 0.99640 3.15      0.55    10.2               1
## 5243                   6 1.00060 3.12      0.78    10.7               1
## 2299                  23 0.99018 3.34      0.54    12.8               1
## 1064                  36 0.99560 3.30      0.67     9.5               0
# family = binomial fits logistic models for the 0/1 response; bestglm()
# otherwise defaults to gaussian.
# NOTE(review): the search still runs on the test partition, as in the
# original; consider wine_training so the test set stays untouched.
# NOTE(review): the results printed after this chunk were produced before
# this fix and must be regenerated by re-knitting the document.
res.bestglm <- bestglm(
  Xy = wine_test_1,
  family = binomial,
  IC = "AIC", # information criterion used to rank candidate models
  method = "exhaustive"
)
summary(res.bestglm)
## Fitting algorithm:  AIC-leaps
## Best Model:
##              df   deviance
## Null Model 1729   4.122665
## Full Model 1740 105.525333
## 
##  likelihood-ratio test - GLM
## 
## data:  H0: Null Model vs. H1: Best Fit AIC-leaps
## X = 101.4, df = 11, p-value < 2.2e-16
# Table of the top-ranked candidate models with their criterion values.
res.bestglm[["BestModels"]]
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1          TRUE             TRUE        TRUE           TRUE      TRUE
## 2          TRUE             TRUE        TRUE           TRUE     FALSE
## 3          TRUE             TRUE        TRUE           TRUE      TRUE
## 4          TRUE             TRUE        TRUE           TRUE     FALSE
## 5          TRUE             TRUE        TRUE           TRUE      TRUE
##   free.sulfur.dioxide density   pH sulphates alcohol qualityvariable Criterion
## 1                TRUE    TRUE TRUE      TRUE    TRUE            TRUE -10503.59
## 2                TRUE    TRUE TRUE      TRUE    TRUE            TRUE -10503.58
## 3                TRUE   FALSE TRUE      TRUE    TRUE            TRUE -10491.27
## 4                TRUE   FALSE TRUE      TRUE    TRUE            TRUE -10487.81
## 5                TRUE    TRUE TRUE      TRUE    TRUE           FALSE -10486.94

4 Conclusion

Finally, we can see that the Logistic Regression model performed well on the dataset. Even though its accuracy is only ~73 %, the area under the ROC curve (AUC) was ~80.1 %, which makes this a reasonably solid model. The KNN model reached a similar accuracy of ~74 %, but its ROC score was not satisfactory and contradicts that accuracy. Hence, we didn’t take the accuracy of KNN into consideration.